{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "d70938ec",
   "metadata": {},
   "source": [
    "<a href=\"https://colab.research.google.com/github/jerryjliu/llama_index/blob/main/docs/examples/data_connectors/html_tag_reader.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8e658683-746c-492c-9aeb-2aaf3ec560b0",
   "metadata": {},
   "source": [
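    "# HTML Tag Reader\n",
    "\n",
    "This notebook downloads a page from the Ray documentation and loads it with `HTMLTagReader`, producing one document per `<section>` tag."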
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a0899b5e",
   "metadata": {},
   "source": [
    "If you're opening this notebook on Colab, you will probably need to install LlamaIndex 🦙."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "753398e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install llama-index-readers-file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4c71498",
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip (not !pip) installs into the environment of the running kernel\n",
    "%pip install llama-index"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b20aec97-43e5-49da-a57d-7a79bfe1c471",
   "metadata": {},
   "source": [
    "### Download HTML file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e07f31eb-65bb-4439-a1b1-6de8bfad67dd",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "--2023-09-07 16:36:36--  https://docs.ray.io/en/master/ray-overview/installation.html\n",
      "Resolving docs.ray.io (docs.ray.io)... 104.18.1.163, 104.18.0.163\n",
      "Connecting to docs.ray.io (docs.ray.io)|104.18.1.163|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: unspecified [text/html]\n",
      "Saving to: ‘data/docs.ray.io/en/master/ray-overview/installation.html’\n",
      "\n",
      "     0K .......... .......... .......... .......... ..........  125M\n",
      "    50K .......... .......... .......... .......... .......... 21.4M\n",
      "   100K .......... .......... .......... ........              1.01M=0.04s\n",
      "\n",
      "2023-09-07 16:36:37 (3.37 MB/s) - ‘data/docs.ray.io/en/master/ray-overview/installation.html’ saved [142067]\n",
      "\n",
      "FINISHED --2023-09-07 16:36:37--\n",
      "Total wall clock time: 0.3s\n",
      "Downloaded: 1 files, 139K in 0.04s (3.37 MB/s)\n",
      "Converting links in data/docs.ray.io/en/master/ray-overview/installation.html... 116.\n",
      "48-68\n",
      "Converted links in 1 files in 0.002 seconds.\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "# Download the Ray installation page into data/.\n",
    "# --no-clobber is left out on purpose: wget reports that it is ignored\n",
    "# whenever --convert-links is also given.\n",
    "wget -e robots=off --page-requisites \\\n",
    "  --html-extension --convert-links --restrict-file-names=windows \\\n",
    "  --domains docs.ray.io --no-parent --accept=html \\\n",
    "  -P data/ https://docs.ray.io/en/master/ray-overview/installation.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d63dc1d-0c63-4c89-9693-324a6f026811",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.readers.file import HTMLTagReader\n",
    "\n",
    "# Page saved by the wget cell above\n",
    "html_path = \"data/docs.ray.io/en/master/ray-overview/installation.html\"\n",
    "\n",
    "# Read <section> tags; ignore_no_id=True presumably skips sections without an id (confirm in the reader docs)\n",
    "reader = HTMLTagReader(tag=\"section\", ignore_no_id=True)\n",
    "docs = reader.load_data(html_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79869054-0200-4642-a47e-c4b94c2164c4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'tag': 'section', 'tag_id': 'installing-ray', 'file_path': 'data/docs.ray.io/en/master/ray-overview/installation.html'}\n",
      "{'tag': 'section', 'tag_id': 'official-releases', 'file_path': 'data/docs.ray.io/en/master/ray-overview/installation.html'}\n",
      "{'tag': 'section', 'tag_id': 'from-wheels', 'file_path': 'data/docs.ray.io/en/master/ray-overview/installation.html'}\n",
      "{'tag': 'section', 'tag_id': 'daily-releases-nightlies', 'file_path': 'data/docs.ray.io/en/master/ray-overview/installation.html'}\n",
      "{'tag': 'section', 'tag_id': 'installing-from-a-specific-commit', 'file_path': 'data/docs.ray.io/en/master/ray-overview/installation.html'}\n",
      "{'tag': 'section', 'tag_id': 'install-ray-java-with-maven', 'file_path': 'data/docs.ray.io/en/master/ray-overview/installation.html'}\n",
      "{'tag': 'section', 'tag_id': 'install-ray-c', 'file_path': 'data/docs.ray.io/en/master/ray-overview/installation.html'}\n",
      "{'tag': 'section', 'tag_id': 'm1-mac-apple-silicon-support', 'file_path': 'data/docs.ray.io/en/master/ray-overview/installation.html'}\n",
      "{'tag': 'section', 'tag_id': 'windows-support', 'file_path': 'data/docs.ray.io/en/master/ray-overview/installation.html'}\n",
      "{'tag': 'section', 'tag_id': 'installing-ray-on-arch-linux', 'file_path': 'data/docs.ray.io/en/master/ray-overview/installation.html'}\n",
      "{'tag': 'section', 'tag_id': 'installing-from-conda-forge', 'file_path': 'data/docs.ray.io/en/master/ray-overview/installation.html'}\n",
      "{'tag': 'section', 'tag_id': 'building-ray-from-source', 'file_path': 'data/docs.ray.io/en/master/ray-overview/installation.html'}\n",
      "{'tag': 'section', 'tag_id': 'docker-source-images', 'file_path': 'data/docs.ray.io/en/master/ray-overview/installation.html'}\n",
      "{'tag': 'section', 'tag_id': 'launch-ray-in-docker', 'file_path': 'data/docs.ray.io/en/master/ray-overview/installation.html'}\n",
      "{'tag': 'section', 'tag_id': 'test-if-the-installation-succeeded', 'file_path': 'data/docs.ray.io/en/master/ray-overview/installation.html'}\n",
      "{'tag': 'section', 'tag_id': 'installed-python-dependencies', 'file_path': 'data/docs.ray.io/en/master/ray-overview/installation.html'}\n"
     ]
    }
   ],
   "source": [
    "# Print the metadata dict (tag, tag_id, file_path) of every loaded document\n",
    "for section_doc in docs:\n",
    "    section_metadata = section_doc.metadata\n",
    "    print(section_metadata)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
